library(tidyverse)
library(here)
library(rio)
library(janitor)
library(stringr)
library(colorblindr)
library(glue)

We are using data from the Fragile Families & Child Well-Being Study (Princeton) for this project. The study is longitudinal, with data collected from child participants, their parents/guardians, and their teachers at multiple time points (e.g., Baseline, Year 9, Year 15). More information about the study can be found at the following links:

Our original research questions were:

  1. What is the association between internalizing or externalizing behaviors at age 9 and rates of delinquent behaviors at age 15?
  2. Which delinquent behaviors are most frequent for internalizing vs. externalizing behaviors?
  3. Do race/ethnicity, gender, or other demographic characteristics impact this association?

After obtaining the dataset and beginning to work with it, we did not move forward with RQ2. In addition, we decided to explore the relationship between internalizing and externalizing behaviors at age 9 and the number of suspensions/expulsions at age 15.

The complete data file has more than 17,000 variables. We’ll use the meta-data file, which is like a data dictionary (lists all variables and associated information, such as when it was collected and an explanation of what the variable is), to select the variables of interest, and use the ff_vars vector to subset the actual data file.

For this project, we’re interested in:

  1. Demographic characteristics of participants:
    1. Self-reported race/ethnicity at age 15
    2. Mother’s report of gender at birth
  2. Scales:
    1. Self-Description Questionnaire (SDQ) - Information on internalizing and externalizing behaviors at age 9
    2. Delinquent Behavior Scale at age 9 and age 15
    3. Aggravation in parenting - collected for Baseline all years
  3. Outcomes from questionnaire:
    1. Whether or not the child was suspended/expelled from school at age 9 and 15 (parent and child), and number of times suspended or expelled at age 15 (parent and child)

Loading the Meta-Data and Subsetting

#importing meta-data
ff_meta <- import(here("data","FFMetadata_v09.csv")) %>%
    clean_names() %>%
    as_tibble()
#select scales
scales <- c("Aggravation in Parents", 
            "Self-Description Questionnaire (SDQ)",
            "Delinquent Behavior") 

#further narrow the meta-data file to variables of interest
ff_meta_subset <- ff_meta %>% 
                #demographic characteristics - race/ethnicity
                filter((topics == "Demographics" &
                        str_detect(varlab,
                                   "self-description of race/ethnicity")) |
                        #gender
                        (subtopics == "sex/gender" &
                        str_detect(varlab, "Focal baby's gender")) |
                        #scales
                        scale %in% scales|
                        #outcomes from questionnaire
                        (source == "questionnaire" & 
                        subtopics == "student experiences" &
                        str_detect(varlab,"suspen"))) %>% 
                #only include relevant columns
                select(new_name, 
                       varlab, 
                       topics, 
                       subtopics, 
                       type, 
                       scale, 
                       source, 
                       respondent, 
                       survey, 
                       wave,
                       starts_with("label"))

Our meta-data subset includes the following information:

  1. variable names corresponding to columns in the actual data set
  2. variable labels - actual question statement
  3. topics
  4. subtopic
  5. type - types of variable (continuous, categorical, etc.)
  6. scale - which scale was used. “” means no scale was used
  7. source
  8. respondent
  9. survey
  10. wave - year and wave data was collected
  11. starts_with(“label”) - description of possible responses to questions and how they’re coded.

Loading the Full Dataset and Narrowing to Variables of Interest Using Meta-Data Subset

#use variable names to subset the dataset

ff_vars <- ff_meta_subset$new_name

#Import the full, original dataset. Note this will not work for peer reviewers. We are only sharing the narrowed dataset on github. 

#ff <- import(here("data","FF_allwaves_2020v2_SPSS.sav")) %>% 
#    clean_names()

# ff_sub <- ff %>% 
#     select(idnum, #identifier 
#            all_of(ff_vars)) %>%  #our selected variables
#     as_tibble()
#let's save the subsetted df so that we don't have to clean each time

#export(ff_sub, here("data","ff_sub.Rda"))

ff_sub_orig <- import(here("data","ff_sub.Rda"))

#AW: I appended "_orig" here to retain an original version of the df

The variables (columns) have the following attributes:

  1. label: variable label
  2. names: possible responses for a question
#let's join the meta data to the subset so that we know which scale it comes from

ff_sub_long <- ff_sub_orig %>% 
    pivot_longer(
        cols = -1,
        names_to = "response"
    ) %>% 
    left_join(ff_meta, by = c("response" = "new_name"))

Plots of Self-Description Questionnnaire Responses Using nest_by()

#Let's make some example plots

plots <- ff_sub_long %>% 
    filter(scale == "Self-Description Questionnaire (SDQ)" & value >= 0) %>%
    nest_by(varlab) %>% 
    summarise(
        plot = list(
            ggplot(data, aes(x = as.factor(value))) +
            geom_bar() +
            labs(
                title = glue::glue("{varlab}")
            )   
        )
    )

walk(plots$plot, print)

Calculate scores for the internalizing and externalizing subscales

#Calculates subscores for internalizing and externalizing behaviors at age 9
ff_sub <- ff_sub_orig %>% 
    mutate(int_scores = (k5g2a + k5g2c + k5g2e + k5g2g + 
                             k5g2i + k5g2j + k5g2k + k5g2l) / 8, 
           ext_scores = (k5g2b + k5g2d + k5g2f + 
                             k5g2h + k5g2m + k5g2n) / 6) %>%
    filter(int_scores >= 0, ext_scores >= 0) 

#AW: I think that scores lower than 0 should be filtered out first. Filtering out after calculating the mean will make some participant's scores appear lower than they should be because any negative-coded values are being subtracted from other responses. E.g., let's say a participant had -3, -3, 3, 3, 3, 3. The negative responses are getting subtracted from the sum before dividing. After re-reading the instructions for calculating the subscale scores, I don't think the participant should get a subscale score if they have any negatively coded items. "When a participant responds with don’t know, refuse, or missing, to any item on a given scale, their scale score will be missing..."

#AW: I was thinking something like the code below could work but realized I don't think we want to filter/subset this way. There will be some kids with int. scores but not ext. scores and vice versa. We could (a) create separate dataframes for participants with externalizing subscale scores and participants with internalizing subscale scores or (b) decide we want to restrict the sample to only include participants that had valid scores for all scales. I might also be overthinking this.

ff_sub2 <- ff_sub_orig %>%
    filter(k5g2b >= 0 &
           k5g2d >= 0 &
           k5g2f >= 0 &
           k5g2h >= 0 &
           k5g2m >= 0 &
           k5g2n >= 0
           ) %>%
    rowwise() %>%
    mutate(sdq_externalizing = mean(c(k5g2b,
                                      k5g2d,
                                      k5g2f,
                                      k5g2h,
                                      k5g2m,
                                      k5g2n)
                                    )
           ) %>% 
    ungroup()

#AW: I replicated the externalizing x suspensions/expulsions plot using the # of times expelled/suspended as reported by parents (p6c22), filtering to remove negative values first

p <- ff_sub2 %>% 
    filter(p6c22 >= 0) %>% 
    ggplot(aes(sdq_externalizing, p6c22)) 

p + geom_smooth(method = "lm",
                    color = "magenta") +
    geom_smooth()

Subsetting: select int/ext scores, sex, ethnicity, delinquent behaviors, expulsions/suspensions and filter by valid scores (i.e., less than 0)

ff_sub_lm <- ff_sub %>% 
    rowwise() %>% 
    select(idnum, 
           starts_with("k6d6"), 
           starts_with("k5f1"), 
           int_scores, 
           ext_scores, 
           cm1bsex, 
           ck6ethrace, 
           p5l12g, 
           p6c22) %>% #AW: If you look at the ff_meta_subset df, p6c21 is a binary yes/no variable (C21. Youth ever been suspended/expelled?). I think we want p6c22 (# times reported by parent) or k6b30 (# times reported by student) instead. I changed this and subsequent instances to p6c22
    filter(rowSums(across(where(is.numeric))) >= 0 & 
               ck6ethrace >= 0 & 
               p5l12g >= 0 & 
               p6c22 >= 0) %>% 
    mutate(del_beh_9 = sum(c_across(starts_with("k5f1"))), # binary yes/no
           del_beh_15 = sum(c_across(starts_with("k6d6"))),#AW: I think this should be k6d61 (responses 1-4, with 4 being highest - see ff_meta_subset file) because the k6d62 variables are from the peer delinquency portion of the scale and have different responses (1, 2, 3), where lower scores correspond to greater delinquent behaviors (1 = often, 3 = never). I'm not sure negative responses are getting filtered out since there's a -8 in the resulting column 
           del_beh_15_self_rep = sum(c_across(starts_with("k6d61")))) #I added a column here using k6d61 for comparison

#trying out filtering zeroes out first and then joining with ff_sub_lm to compare    
ff_sub_lm2 <- ff_sub %>% 
    filter(k6d61a > 0 &
           k6d61b > 0 &
           k6d61c > 0 & 
           k6d61d > 0 &
           k6d61e > 0 &
           k6d61f > 0 & 
           k6d61g > 0 & 
           k6d61h > 0 &
           k6d61i > 0 &
           k6d61j > 0 &
           k6d61k > 0 &
           k6d61l > 0 &
           k6d61m > 0) %>% 
    rowwise() %>% 
    mutate(del_beh_15_self_rep2 = sum(c_across(starts_with("k6d61")))) %>%
    select(idnum, del_beh_15_self_rep2)

ff_sub_lm3 <- left_join(ff_sub_lm, ff_sub_lm2)

Function: calculate mean across variables

means_df <- function(df, ...) {
    means <- map(df, mean, ...) 
    nas <- map_lgl(means, is.na)
    means_l <- means[!nas] 
    as.data.frame(means_l) 
}

means_df(ff_sub)
##    cm1bsex    k5f1a   k5f1b    k5f1c    k5f1d    k5f1e    k5f1f    k5f1g
## 1 1.483629 1.848904 1.88435 1.908081 1.926705 1.689396 1.945329 1.919796
##      k5f1h    k5f1i    k5f1j    k5f1k    k5f1l    k5f1m    k5f1n    k5f1o
## 1 1.974166 1.975068 1.953139 1.990388 1.985882 1.816161 1.963352 1.981376
##      k5f1p    k5f1q    k5g2a     k5g2b    k5g2c     k5g2d     k5g2e    k5g2f
## 1 1.929709 1.943827 1.048363 0.8269751 1.528687 0.9008711 0.7065185 1.219585
##       k5g2g     k5g2h    k5g2i    k5g2j     k5g2k   k5g2l    k5g2m     k5g2n
## 1 0.6996095 0.7894263 1.455392 1.438871 0.8918594 1.42295 1.082908 0.6800841
##      p5l12g     p6c21     p6c22 ck6ethrace     k6b29     k6b30    k6d61a
## 1 0.6794833 0.9933914 -3.956143   1.101532 0.7026134 -4.028237 0.1685191
##      k6d61b    k6d61c    k6d61d    k6d61e    k6d61f    k6d61g    k6d61h
## 1 0.1952538 0.2258937 0.4466807 0.2427155 0.1628117 0.1559027 0.1432863
##      k6d61i    k6d61j    k6d61k    k6d61l    k6d61m  k6d62a   k6d62b   k6d62c
## 1 0.1459898 0.1595074 0.2349054 0.2847702 0.4629018 1.63833 1.497146 1.407029
##     k6d62d   k6d62e   k6d62f   k6d62g   k6d62h   k6d62i   k6d62j   k6d62k
## 1 1.721538 1.796636 1.880144 1.748573 1.739862 1.831481 1.735957 1.639231
##   int_scores ext_scores
## 1   1.149031  0.9166416

Linear Model functions

ff_sub_lm$idnum <- as.numeric(ff_sub_lm$idnum)

ff_sub_lm$ck6ethrace <- as.numeric(ff_sub_lm$ck6ethrace)

lm_mods <- function (vardep, varindep1, varindep2, varindep3, DATA) {
  summary(lm(paste(vardep, "~", 
                   varindep1, "+", 
                   varindep2, "+", 
                   varindep3), 
             data = DATA))
  }

lm_mods("del_beh_15", "ext_scores", "cm1bsex", "ck6ethrace", ff_sub_lm)
## 
## Call:
## lm(formula = paste(vardep, "~", varindep1, "+", varindep2, "+", 
##     varindep3), data = DATA)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -52.470  -0.928   1.403   2.656  16.614 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  44.8834     1.0911  41.136   <2e-16 ***
## ext_scores    0.2531     0.3175   0.797    0.426    
## cm1bsex       0.3527     0.5022   0.702    0.483    
## ck6ethrace   -0.5726     0.2916  -1.964    0.050 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.326 on 671 degrees of freedom
## Multiple R-squared:  0.007143,   Adjusted R-squared:  0.002704 
## F-statistic: 1.609 on 3 and 671 DF,  p-value: 0.186
lm_mods("del_beh_15", "int_scores", "cm1bsex", "ck6ethrace", ff_sub_lm)
## 
## Call:
## lm(formula = paste(vardep, "~", varindep1, "+", varindep2, "+", 
##     varindep3), data = DATA)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -52.252  -0.939   1.402   2.636  16.410 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  45.5536     1.0589  43.018   <2e-16 ***
## int_scores   -0.2252     0.3332  -0.676    0.499    
## cm1bsex       0.2896     0.4966   0.583    0.560    
## ck6ethrace   -0.5705     0.2918  -1.955    0.051 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.327 on 671 degrees of freedom
## Multiple R-squared:  0.006878,   Adjusted R-squared:  0.002438 
## F-statistic: 1.549 on 3 and 671 DF,  p-value: 0.2006
lm_mods("del_beh_9", "int_scores", "cm1bsex", "ck6ethrace", ff_sub_lm)
## 
## Call:
## lm(formula = paste(vardep, "~", varindep1, "+", varindep2, "+", 
##     varindep3), data = DATA)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -65.193  -0.807   0.655   1.555   3.340 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 31.11093    0.55734  55.821  < 2e-16 ***
## int_scores  -0.60890    0.17539  -3.472 0.000550 ***
## cm1bsex      1.01349    0.26138   3.877 0.000116 ***
## ck6ethrace   0.02879    0.15357   0.187 0.851349    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.33 on 671 degrees of freedom
## Multiple R-squared:  0.03924,    Adjusted R-squared:  0.03495 
## F-statistic: 9.135 on 3 and 671 DF,  p-value: 6.235e-06
lm_mods("del_beh_9", "ext_scores", "cm1bsex", "ck6ethrace", ff_sub_lm)
## 
## Call:
## lm(formula = paste(vardep, "~", varindep1, "+", varindep2, "+", 
##     varindep3), data = DATA)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -64.839  -0.650   0.453   1.314   3.805 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  32.5042     0.5493  59.178  < 2e-16 ***
## ext_scores   -1.3917     0.1598  -8.708  < 2e-16 ***
## cm1bsex       0.6942     0.2528   2.746  0.00619 ** 
## ck6ethrace   -0.0201     0.1468  -0.137  0.89113    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.185 on 671 degrees of freedom
## Multiple R-squared:  0.1213, Adjusted R-squared:  0.1174 
## F-statistic: 30.87 on 3 and 671 DF,  p-value: < 2.2e-16
lm_mods("p5l12g", "ext_scores", "cm1bsex", "ck6ethrace", ff_sub_lm)
## 
## Call:
## lm(formula = paste(vardep, "~", varindep1, "+", varindep2, "+", 
##     varindep3), data = DATA)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.95410  0.07816  0.14878  0.20082  0.31582 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.78997    0.06461  27.705  < 2e-16 ***
## ext_scores  -0.06245    0.01880  -3.322 0.000943 ***
## cm1bsex      0.06021    0.02974   2.025 0.043263 *  
## ck6ethrace   0.01093    0.01727   0.633 0.527141    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3746 on 671 degrees of freedom
## Multiple R-squared:  0.02639,    Adjusted R-squared:  0.02204 
## F-statistic: 6.062 on 3 and 671 DF,  p-value: 0.0004487
lm_mods("p5l12g", "int_scores", "cm1bsex", "ck6ethrace", ff_sub_lm)
## 
## Call:
## lm(formula = paste(vardep, "~", varindep1, "+", varindep2, "+", 
##     varindep3), data = DATA)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9149  0.1050  0.1603  0.1979  0.2609 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.73262    0.06309  27.462   <2e-16 ***
## int_scores  -0.03149    0.01985  -1.586   0.1132    
## cm1bsex      0.07448    0.02959   2.517   0.0121 *  
## ck6ethrace   0.01326    0.01738   0.763   0.4460    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.377 on 671 degrees of freedom
## Multiple R-squared:  0.01408,    Adjusted R-squared:  0.009668 
## F-statistic: 3.193 on 3 and 671 DF,  p-value: 0.02313
lm_mods("p6c22", "ext_scores", "cm1bsex", "ck6ethrace", ff_sub_lm)
## 
## Call:
## lm(formula = paste(vardep, "~", varindep1, "+", varindep2, "+", 
##     varindep3), data = DATA)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.300 -1.604 -1.047  0.412 37.581 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  1.95271    0.59566   3.278   0.0011 **
## ext_scores   0.42653    0.17332   2.461   0.0141 * 
## cm1bsex     -0.01671    0.27415  -0.061   0.9514   
## ck6ethrace   0.02815    0.15920   0.177   0.8597   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.454 on 671 degrees of freedom
## Multiple R-squared:  0.009235,   Adjusted R-squared:  0.004805 
## F-statistic: 2.085 on 3 and 671 DF,  p-value: 0.1009
lm_mods("p6c22", "ext_scores", "cm1bsex", "ck6ethrace", ff_sub_lm)
## 
## Call:
## lm(formula = paste(vardep, "~", varindep1, "+", varindep2, "+", 
##     varindep3), data = DATA)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.300 -1.604 -1.047  0.412 37.581 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  1.95271    0.59566   3.278   0.0011 **
## ext_scores   0.42653    0.17332   2.461   0.0141 * 
## cm1bsex     -0.01671    0.27415  -0.061   0.9514   
## ck6ethrace   0.02815    0.15920   0.177   0.8597   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.454 on 671 degrees of freedom
## Multiple R-squared:  0.009235,   Adjusted R-squared:  0.004805 
## F-statistic: 2.085 on 3 and 671 DF,  p-value: 0.1009
mod_db_int_15 <- ff_sub_lm %>%
    group_by(idnum) %>%
    nest() %>%
    mutate(
        model = map(
            data, ~lm(del_beh_15 ~ int_scores + cm1bsex + ck6ethrace, data = .x)
        )
    )

mod_db_ext_15 <- ff_sub_lm %>%
    group_by(idnum) %>%
    nest() %>%
    mutate(
        model = map(
            data, ~lm(del_beh_15 ~ ext_scores + cm1bsex + ck6ethrace, data = .x)
        )
    )

mod_db_int_9 <- ff_sub_lm %>%
    group_by(idnum) %>%
    nest() %>%
    mutate(
        model = map(
            data, ~lm(del_beh_9 ~ int_scores + cm1bsex + ck6ethrace, data = .x)
        )
    )

mod_db_ext_9 <- ff_sub_lm %>%
    group_by(idnum) %>%
    nest() %>%
    mutate(
        model = map(
            data, ~lm(del_beh_9 ~ ext_scores + cm1bsex + ck6ethrace, data = .x)
        )
    )

mod_expulsion9_int <- ff_sub_lm %>%
    group_by(idnum) %>%
    nest() %>%
    mutate(
        model = map(
            data, ~lm(p5l12g ~ int_scores + cm1bsex + ck6ethrace, data = .x)
        )
    )

mod_expulsion15_ext <- ff_sub_lm %>%
    group_by(idnum) %>%
    nest() %>%
    mutate(
        model = map(
            data, ~lm(p6c22 ~ ext_scores + cm1bsex + ck6ethrace, data = .x)
        )
    )

mod_expulsion9_ext <- ff_sub_lm %>%
    group_by(idnum) %>%
    nest() %>%
    mutate(
        model = map(
            data, ~lm(p5l12g ~ ext_scores + cm1bsex + ck6ethrace, data = .x)
        )
    )

mod_expulsion15_int <- ff_sub_lm %>%
    group_by(idnum) %>%
    nest() %>%
    mutate(
        model = map(
            data, ~lm(p6c22 ~ int_scores + cm1bsex + ck6ethrace, data = .x)
        )
    )

pull_coef <- function(model, coef_name) {
    coef(model)[coef_name]
}

mod_db_ext_15 %>%
    mutate(intercept = map_dfr(model, pull_coef))
## # A tibble: 675 × 4
## # Groups:   idnum [675]
##    idnum data              model  intercept$`(Intercept)` $ext_scores $cm1bsex
##    <dbl> <list>            <list>                   <dbl>       <dbl>    <dbl>
##  1     8 <tibble [1 × 50]> <lm>                        47          NA       NA
##  2    14 <tibble [1 × 50]> <lm>                        -8          NA       NA
##  3    19 <tibble [1 × 50]> <lm>                        44          NA       NA
##  4    25 <tibble [1 × 50]> <lm>                        41          NA       NA
##  5    28 <tibble [1 × 50]> <lm>                        46          NA       NA
##  6    30 <tibble [1 × 50]> <lm>                        46          NA       NA
##  7    33 <tibble [1 × 50]> <lm>                        48          NA       NA
##  8    40 <tibble [1 × 50]> <lm>                        47          NA       NA
##  9    48 <tibble [1 × 50]> <lm>                        46          NA       NA
## 10    52 <tibble [1 × 50]> <lm>                        44          NA       NA
## # … with 665 more rows, and 1 more variable: intercept$ck6ethrace <dbl>
mod_db_int_15 %>%
    mutate(intercept = map_dfr(model, pull_coef))
## # A tibble: 675 × 4
## # Groups:   idnum [675]
##    idnum data              model  intercept$`(Intercept)` $int_scores $cm1bsex
##    <dbl> <list>            <list>                   <dbl>       <dbl>    <dbl>
##  1     8 <tibble [1 × 50]> <lm>                        47          NA       NA
##  2    14 <tibble [1 × 50]> <lm>                        -8          NA       NA
##  3    19 <tibble [1 × 50]> <lm>                        44          NA       NA
##  4    25 <tibble [1 × 50]> <lm>                        41          NA       NA
##  5    28 <tibble [1 × 50]> <lm>                        46          NA       NA
##  6    30 <tibble [1 × 50]> <lm>                        46          NA       NA
##  7    33 <tibble [1 × 50]> <lm>                        48          NA       NA
##  8    40 <tibble [1 × 50]> <lm>                        47          NA       NA
##  9    48 <tibble [1 × 50]> <lm>                        46          NA       NA
## 10    52 <tibble [1 × 50]> <lm>                        44          NA       NA
## # … with 665 more rows, and 1 more variable: intercept$ck6ethrace <dbl>
mod_db_ext_9 %>%
    mutate(intercept = map_dfr(model, pull_coef))
## # A tibble: 675 × 4
## # Groups:   idnum [675]
##    idnum data              model  intercept$`(Intercept)` $ext_scores $cm1bsex
##    <dbl> <list>            <list>                   <dbl>       <dbl>    <dbl>
##  1     8 <tibble [1 × 50]> <lm>                        34          NA       NA
##  2    14 <tibble [1 × 50]> <lm>                        33          NA       NA
##  3    19 <tibble [1 × 50]> <lm>                        33          NA       NA
##  4    25 <tibble [1 × 50]> <lm>                        29          NA       NA
##  5    28 <tibble [1 × 50]> <lm>                        33          NA       NA
##  6    30 <tibble [1 × 50]> <lm>                        34          NA       NA
##  7    33 <tibble [1 × 50]> <lm>                        32          NA       NA
##  8    40 <tibble [1 × 50]> <lm>                        33          NA       NA
##  9    48 <tibble [1 × 50]> <lm>                        32          NA       NA
## 10    52 <tibble [1 × 50]> <lm>                        28          NA       NA
## # … with 665 more rows, and 1 more variable: intercept$ck6ethrace <dbl>
mod_db_int_9 %>%
    mutate(intercept = map_dfr(model, pull_coef))
## # A tibble: 675 × 4
## # Groups:   idnum [675]
##    idnum data              model  intercept$`(Intercept)` $int_scores $cm1bsex
##    <dbl> <list>            <list>                   <dbl>       <dbl>    <dbl>
##  1     8 <tibble [1 × 50]> <lm>                        34          NA       NA
##  2    14 <tibble [1 × 50]> <lm>                        33          NA       NA
##  3    19 <tibble [1 × 50]> <lm>                        33          NA       NA
##  4    25 <tibble [1 × 50]> <lm>                        29          NA       NA
##  5    28 <tibble [1 × 50]> <lm>                        33          NA       NA
##  6    30 <tibble [1 × 50]> <lm>                        34          NA       NA
##  7    33 <tibble [1 × 50]> <lm>                        32          NA       NA
##  8    40 <tibble [1 × 50]> <lm>                        33          NA       NA
##  9    48 <tibble [1 × 50]> <lm>                        32          NA       NA
## 10    52 <tibble [1 × 50]> <lm>                        28          NA       NA
## # … with 665 more rows, and 1 more variable: intercept$ck6ethrace <dbl>
mod_expulsion15_ext %>%
    mutate(intercept = map_dfr(model, pull_coef))
## # A tibble: 675 × 4
## # Groups:   idnum [675]
##    idnum data              model  intercept$`(Intercept)` $ext_scores $cm1bsex
##    <dbl> <list>            <list>                   <dbl>       <dbl>    <dbl>
##  1     8 <tibble [1 × 50]> <lm>                         2          NA       NA
##  2    14 <tibble [1 × 50]> <lm>                         0          NA       NA
##  3    19 <tibble [1 × 50]> <lm>                         0          NA       NA
##  4    25 <tibble [1 × 50]> <lm>                         1          NA       NA
##  5    28 <tibble [1 × 50]> <lm>                         1          NA       NA
##  6    30 <tibble [1 × 50]> <lm>                         0          NA       NA
##  7    33 <tibble [1 × 50]> <lm>                         2          NA       NA
##  8    40 <tibble [1 × 50]> <lm>                         2          NA       NA
##  9    48 <tibble [1 × 50]> <lm>                         1          NA       NA
## 10    52 <tibble [1 × 50]> <lm>                        20          NA       NA
## # … with 665 more rows, and 1 more variable: intercept$ck6ethrace <dbl>
mod_expulsion15_int%>%
    mutate(intercept = map_dfr(model, pull_coef))
## # A tibble: 675 × 4
## # Groups:   idnum [675]
##    idnum data              model  intercept$`(Intercept)` $int_scores $cm1bsex
##    <dbl> <list>            <list>                   <dbl>       <dbl>    <dbl>
##  1     8 <tibble [1 × 50]> <lm>                         2          NA       NA
##  2    14 <tibble [1 × 50]> <lm>                         0          NA       NA
##  3    19 <tibble [1 × 50]> <lm>                         0          NA       NA
##  4    25 <tibble [1 × 50]> <lm>                         1          NA       NA
##  5    28 <tibble [1 × 50]> <lm>                         1          NA       NA
##  6    30 <tibble [1 × 50]> <lm>                         0          NA       NA
##  7    33 <tibble [1 × 50]> <lm>                         2          NA       NA
##  8    40 <tibble [1 × 50]> <lm>                         2          NA       NA
##  9    48 <tibble [1 × 50]> <lm>                         1          NA       NA
## 10    52 <tibble [1 × 50]> <lm>                        20          NA       NA
## # … with 665 more rows, and 1 more variable: intercept$ck6ethrace <dbl>
mod_expulsion9_ext %>%
    mutate(intercept = map_dfr(model, pull_coef))
## # A tibble: 675 × 4
## # Groups:   idnum [675]
##    idnum data              model  intercept$`(Intercept)` $ext_scores $cm1bsex
##    <dbl> <list>            <list>                   <dbl>       <dbl>    <dbl>
##  1     8 <tibble [1 × 50]> <lm>                         2          NA       NA
##  2    14 <tibble [1 × 50]> <lm>                         2          NA       NA
##  3    19 <tibble [1 × 50]> <lm>                         2          NA       NA
##  4    25 <tibble [1 × 50]> <lm>                         1          NA       NA
##  5    28 <tibble [1 × 50]> <lm>                         1          NA       NA
##  6    30 <tibble [1 × 50]> <lm>                         2          NA       NA
##  7    33 <tibble [1 × 50]> <lm>                         2          NA       NA
##  8    40 <tibble [1 × 50]> <lm>                         2          NA       NA
##  9    48 <tibble [1 × 50]> <lm>                         2          NA       NA
## 10    52 <tibble [1 × 50]> <lm>                         1          NA       NA
## # … with 665 more rows, and 1 more variable: intercept$ck6ethrace <dbl>
mod_expulsion9_int%>%
    mutate(intercept = map_dfr(model, pull_coef))
## # A tibble: 675 × 4
## # Groups:   idnum [675]
##    idnum data              model  intercept$`(Intercept)` $int_scores $cm1bsex
##    <dbl> <list>            <list>                   <dbl>       <dbl>    <dbl>
##  1     8 <tibble [1 × 50]> <lm>                         2          NA       NA
##  2    14 <tibble [1 × 50]> <lm>                         2          NA       NA
##  3    19 <tibble [1 × 50]> <lm>                         2          NA       NA
##  4    25 <tibble [1 × 50]> <lm>                         1          NA       NA
##  5    28 <tibble [1 × 50]> <lm>                         1          NA       NA
##  6    30 <tibble [1 × 50]> <lm>                         2          NA       NA
##  7    33 <tibble [1 × 50]> <lm>                         2          NA       NA
##  8    40 <tibble [1 × 50]> <lm>                         2          NA       NA
##  9    48 <tibble [1 × 50]> <lm>                         2          NA       NA
## 10    52 <tibble [1 × 50]> <lm>                         1          NA       NA
## # … with 665 more rows, and 1 more variable: intercept$ck6ethrace <dbl>
mods <- function(data, x, y, points = FALSE, ...) {
    p <- ggplot(data, aes({{x}}, {{y}})) 
    p + geom_smooth(method = "lm",
                    color = "magenta", 
                    ...) +
        geom_smooth(...) 
}

#AW: Do we want two fitted lines on the graphs? If so, it might be helpful to include a short note about the purpose of the two lines. 

Internalizing year 9 x delinquency behaviors in year 15

mods(ff_sub_lm, int_scores, del_beh_15) +
    labs(title = "Relationship between Internalizing year 9 x suspensions/expulsions in year 15",
         x = "Internalizing behavior score",
         y = "Delinquency behaviors")

Externalizing behaviors year 9 x delinquency behaviors in year 15

mods(ff_sub_lm, ext_scores, del_beh_15) +
    labs(title = "Relationship between Externalizing year 9 x suspensions/expulsions in year 15",
         x = "Externalizing behavior score",
         y = "Delinquency behaviors")

Internalizing year 9 x delinquency behaviors in year 15

mods(ff_sub_lm, int_scores, del_beh_9) +
    labs(title = "Relationship between Internalizing year 9 x suspensions/expulsions in year 9",
         x = "Internalizing behavior score",
         y = "Delinquency behaviors")

Externalizing behaviors year 9 x delinquency behaviors in year 15

mods(ff_sub_lm, ext_scores, del_beh_9) +
    labs(title = "Relationship between Externalizing year 9 x suspensions/expulsions in year 9",
         x = "Externalizing behavior score",
         y = "Delinquency behaviors")

Internalizing year 9 x suspensions/expulsions in year 9

mods(ff_sub_lm, int_scores, p5l12g) +
    labs(title = "Relationship between Internalizing year 9 x suspensions/expulsions in year 9",
         x = "Internalizing behavior score",
         y = "suspensions/expulsions")

#AW: p5l12g is a binary yes/no variable. It looks like the # of times the participant was suspended/expelled wasn't collected in Year 9, just a yes/no whether they were or were not. 

Externalizing behaviors year 9 x suspensions/expulsions in year 9

mods(ff_sub_lm, ext_scores, p5l12g) +
    labs(title = "Relationship between Externalizing year 9 x suspensions/expulsions in year 9",
         x = "Externalizing behavior score",
         y = "suspensions/expulsions")

#AW: p5l12g is a binary yes/no variable. It looks like the # of times the participant was suspended/expelled wasn't collected in Year 9, just a yes/no whether they were or were not. 

Internalizing year 9 x suspensions/expulsions in year 15

mods(ff_sub_lm, int_scores, p6c22) +
    labs(title = "Relationship between Internalizing year 9 x suspensions/expulsions in year 15",
         x = "Internalizing behavior score",
         y = "suspensions/expulsions")

Externalizing behaviors year 9 x suspensions/expulsions in year 15

mods(ff_sub_lm, ext_scores, p6c22) +
    labs(title = "Relationship between Externalizing year 9 x suspensions/expulsions in year 15",
         x = "Externalizing behavior score",
         y = "suspensions/expulsions")

Mean Suspensions/Expulsions and Distributions of Subscale Scores and Suspensions/Explusions Using purrr::nest %>% mutate()

This section is still in progress

cm1bsex = gender of participant, as reported by mother during baseline data collection

  • 1 = male
  • 2 = female

ck6ethrace = race/ethnicity of participant, self-reported during Wave 6 / Year 15

  • 1 = White only, non-hispanic
  • 2 = Black/Af. American only, non-hispanic
  • 3 = Hispanic/Latino
  • 4 = Other only, non-hispanic
  • 5 = Multi-racial, non-hispanic
  • Negative values = don’t know, missing, refused, and not in wave

p6c22 = Number of times youth has been suspended/expelled past two years as reported by primary caregiver in Wave 6/Year 15

# First step - recode race/eth
ethrace <- ff_sub_lm %>%
    mutate(ck6ethrace = recode(ck6ethrace, 
                               "1" = "White",
                               "2" = "Black",
                               "3" = "Hispanic/Latino",
                               "4" = "Other", 
                               "5" = "Multiracial"))

# Beginning to play around with nest() %>% mutate. Here, I calculated the avg. suspensions/expulsions reported by primary caregiver for Year 15 for each race/eth category and generated a bar chart w/ displaying the mean suspensions/expulsion by race/ethnicity subgroup

by_ethrace <- ethrace %>% 
    group_by(ck6ethrace) %>%
    nest() %>% 
    mutate(
        avg_sus_exp = map_dbl(data, ~mean(.x$p6c22)),
    ) 

by_ethrace %>% 
    ggplot(aes(y = avg_sus_exp, x = ck6ethrace)) +
    geom_col()

I then used mutate() with the nested dataframe and map() to generate distributions for the following continuous variable by race/ethnicity:

  • internalizing behavior subscale scores of participants at age 9
  • externalizing behavior subscale scores of participants at age 9,
  • delinquent behavior subscale scores at age 15, and
  • number of suspensions and expulsions at age 15 - all by race/ethnicity.

These are definitely rough drafts. If we include these in our final product, my to-do list would include:

  • theme_minimal()
  • consistent x- and y-axis scaling, breaks, and clean labels
  • use of color and alpha
  • minimal plot elements, such as removing axis grid lines
  • get rid of the expansions above the x- and y-axis

I’d also like to replicate these for gender.

# Distributions of internalizing subscale scores at age 9 by race/ethnicity 

by_ethrace <- by_ethrace %>% 
    mutate(
      distributions_int_scores = map(
          data, ~{
          ggplot(.x, aes(int_scores)) +
                geom_bar() + 
                labs(
          title = "Distribution of Age 9 Internalizing Behavior Scores",
          subtitle = glue("{ck6ethrace} Participants"),
          x = "Internalizing Behavior Subscale Score",
          y = ""
          )
      }
    )
  )
    
#by_ethrace$distributions_int_scores[[1]]

walk(by_ethrace$distributions_int_scores[1:5], print)

# Distributions of externalizing subscale scores at age 9 by race/ethnicity 

by_ethrace <- by_ethrace %>% 
    mutate(
      distributions_ext_scores = map(
          data, ~{
          ggplot(.x, aes(ext_scores)) +
                geom_bar() + 
                labs(
          title = "Distribution of Age 9 Externalizing Behavior Scores",
          subtitle = glue("{ck6ethrace} Participants"),
          x = "Externalizing Behavior Subscale Score",
          y = ""
          )
      }
    )
  )
    
#by_ethrace$distributions_ext_scores[[1]]

walk(by_ethrace$distributions_ext_scores[1:5], print)

#Distributions of delinquent behavior subscale scores reported at age 15 by race/ethnicity subgroup

by_ethrace <- by_ethrace %>% 
    mutate(
      distributions_del_beh_15 = map(
          data, ~{
          ggplot(.x, aes(del_beh_15_self_rep)) +
                geom_bar() + 
                labs(
          title = "Distribution of Age 15 Delinquent Behavior Scores",
          subtitle = glue("{ck6ethrace} Participants"),
          x = "Delinquent Behavior Total Score",
          y = ""
          )
      }
    )
  )
    
#by_ethrace$distributions_del_beh_15[[1]]

walk(by_ethrace$distributions_del_beh_15[1:5], print)

#Distributions of reported suspensions and expulsions reported at age 15. 

by_ethrace <- by_ethrace %>% 
    mutate(
      distributions_susp_exp_15 = map(
          data, ~{
          ggplot(.x, aes(p6c22)) +
                geom_bar() + 
                labs(
          title = "Distribution of Suspensions & Explusions at Age 15",
          subtitle = glue("{ck6ethrace} Participants"),
          x = "Reported Suspensions and Explusions",
          y = ""
          )
      }
    )
  )
    
#by_ethrace$distributions_susp_exp_15[[1]]

walk(by_ethrace$distributions_susp_exp_15[1:5], print)

Parallel interations for scatterplots

ff_sub_lm1 <- ff_sub_lm %>% 
  select( int_scores, ext_scores, cm1bsex, ck6ethrace, 
          del_beh_9, del_beh_15, p5l12g, p6c22)

ff_sub_lm1$cm1bsex <- factor(ff_sub_lm1$cm1bsex, labels = c("male","female"))

ff_sub_lm1$ck6ethrace <- as.factor(ff_sub_lm1$ck6ethrace)

ff_sub_lm1$ck6ethrace <- recode (ff_sub_lm1$ck6ethrace, "1" = "white",
                                                        "2" =  "black",
                                                        "3" = "hispanic",
                                                        "4" = "other",
                                                        "5" = "multi-racial")

#I tried to use the following code with *map* but could not make it work

#ff_sub_lm <- map(ff_sub_lm, ~{
 # .x %>% 
   # mutate(cm1bsex = factor(cm1bsex, labels = c("male","female")),
         #  ck6ethrace = factor(ck6ethrace, labels = c("white",
                                                       # "black",
                                                      #  "hispanic",
                                                        #"other",
                                                       # "multi-racial"))) 
#})

#Function for a scatterplot with a fitted regression line 
scatter1 <- function(df, DV, IV, group) {
 var1 <- deparse(substitute(DV))
 var2 <- deparse(substitute(IV))
 
 p = df %>% 
   ggplot() +
   geom_point(aes(x = IV, y = DV), color = "gray50", stroke = 0, alpha = .6) +
   geom_smooth(method = lm, se = FALSE, 
               aes(x = IV, y = DV, color = group)) +
   scale_y_continuous(expand = c(0,0), 
                     breaks = c(35, 40, 45, 50)) +
  coord_cartesian(ylim = c(35, 55 )) +
  theme_minimal(15) +
  labs(x = print(var2),
       y = print(var1)) +
  theme(plot.title.position = "plot",
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank())
 
  ifelse(var2 == "ff_sub_lm1$int_scores",
        p <- p + labs(title = "Delinquent Behavior at 15 predicted by Internalizing scores at 9"),
        ifelse (var2 == "ff_sub_lm1$ext_scores",
                p <- p + labs(title = "Delinquent Behavior at 15 predicted by Externalizing scores at 9"),
                p <- p))

 p
}


#Del. Behavior at 15 by Internalizing scores at 9, grouped by Race
scatter1(ff_sub_lm1, ff_sub_lm1$del_beh_15, ff_sub_lm1$int_scores, ff_sub_lm1$ck6ethrace)
## [1] "ff_sub_lm1$int_scores"
## [1] "ff_sub_lm1$del_beh_15"

#Del. Behavior at 15 by Internalizing scores at 9, grouped by Gender
scatter1(ff_sub_lm1, ff_sub_lm1$del_beh_15, ff_sub_lm1$int_scores, ff_sub_lm1$cm1bsex)
## [1] "ff_sub_lm1$int_scores"
## [1] "ff_sub_lm1$del_beh_15"

#Del. Behavior at 15 by Externalizing scores at 9, grouped by Race
scatter1(ff_sub_lm1, ff_sub_lm1$del_beh_15, ff_sub_lm1$ext_scores, ff_sub_lm1$ck6ethrace)
## [1] "ff_sub_lm1$ext_scores"
## [1] "ff_sub_lm1$del_beh_15"

#Del. Behavior at 15 by Externalizing scores at 9, grouped by Gender
scatter1(ff_sub_lm1, ff_sub_lm1$del_beh_15, ff_sub_lm1$ext_scores, ff_sub_lm1$cm1bsex)
## [1] "ff_sub_lm1$ext_scores"
## [1] "ff_sub_lm1$del_beh_15"

#Plots for both internalizing and externalizing behavior grouped by gender
map(ff_sub_lm1[1:2],
                          ~{scatter1(ff_sub_lm1, ff_sub_lm1$del_beh_15, .x, ff_sub_lm1$cm1bsex)})
## [1] ".x"
## [1] "ff_sub_lm1$del_beh_15"
## [1] ".x"
## [1] "ff_sub_lm1$del_beh_15"
## $int_scores

## 
## $ext_scores

#Plots for both internalizing and externalizing behavior grouped by gender
map(ff_sub_lm1[1:2],
                          ~{scatter1(ff_sub_lm1, ff_sub_lm1$del_beh_15, .x, ff_sub_lm1$ck6ethrace)})
## [1] ".x"
## [1] "ff_sub_lm1$del_beh_15"
## [1] ".x"
## [1] "ff_sub_lm1$del_beh_15"
## $int_scores

## 
## $ext_scores

#Function for a scatterplot with a fitted regression line (Internalizing behavior * delinquency)
scatter2 <- function(df, group) {
 p = df %>% 
   ggplot() +
   geom_point(aes(x = int_scores, y = del_beh_15), color = "gray50", 
              stroke = 0, alpha = .6) +
   geom_smooth(method = lm, se = FALSE, 
               aes(x = int_scores, y = del_beh_15, color = cm1bsex)) +
   scale_y_continuous(expand = c(0,0), 
                     breaks = c(35, 40, 45, 50)) +
  coord_cartesian(ylim = c(35, 55 )) +
  theme_minimal(15) +
  scale_color_OkabeIto(name = "Gender") +
  theme(plot.title.position = "plot",
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        title =element_text(size=8))
 
 p + labs (title = paste("Delinquent Behavior at 15 predicted by Internalizing scores at 9", group, sep = ": "))
}

nest_df = ff_sub_lm1 %>% 
  group_by(ck6ethrace) %>% 
  nest()

plots2 <- map2(nest_df$data, nest_df$ck6ethrace,
                          ~scatter2(.x, .y))

ggpubr::ggarrange(plots2[[1]], plots2[[2]], plots2[[3]], plots2[[4]], plots2[[5]],
                  ncol = 2, nrow = 3,
                  common.legend = TRUE,
                  legend = 'bottom')

#Function for a scatterplot with a fitted regression line (Externalizing behavior * delinquency)
scatter3 <- function(df, group) {
 p = df %>% 
   ggplot() +
   geom_point(aes(x = ext_scores, y = del_beh_15), color = "gray50", stroke = 0, alpha = .6) +
   geom_smooth(method = lm, se = FALSE, 
               aes(x = ext_scores, y = del_beh_15, color = ck6ethrace)) +
   scale_y_continuous(expand = c(0,0), 
                     breaks = c(35, 40, 45, 50))+
  coord_cartesian(ylim = c(35, 55 )) +
  theme_minimal(15) +
  scale_color_OkabeIto(name = "Race/Ethnicity") +
  theme(plot.title.position = "plot",
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        title =element_text(size=8))
 
 p + labs (title = paste("Delinquent Behavior at 15 predicted by Externalizing scores at 9", group, sep = ": "))
}

nest_df = ff_sub_lm1 %>% 
  group_by(cm1bsex) %>% 
  nest()

plots3 <- map2(nest_df$data, nest_df$cm1bsex,
                          ~scatter3(.x, .y))

ggpubr::ggarrange(plots3[[1]], plots3[[2]], 
                  ncol = 2, nrow = 1,
                  common.legend = TRUE,
                  legend = 'bottom')